This guide accompanies Davis Vaughan’s rstudio::conf 2018 talk, “The future of time series and financial analysis in the tidyverse.”
Unfortunately some of these packages are a bit finicky in what dependencies they require.
qqmap v2.6.1 (CRAN) requires ggplot2 v2.2.1 (CRAN) and will not work with the dev version of ggplot2.
patchwork (Github only) requires and will automatically install the development version of ggplot2.
This causes problems if you want to run this code. I would advise running everything 1 chunk at a time with the CRAN version of ggplot2 until you get down to the performance summary section. At that point, install patchwork with devtools::install_github("thomasp85/patchwork") and run the rest of the performance summary.
# Business Science dev packages
# devtools::install_github("business-science/tibbletime")
# devtools::install_github("business-science/tidyquant2")
library(tibbletime)
library(tidyquant2)
# General packages
library(dplyr)
library(readr)
library(tidyr)
# Used for themes and getting data
library(tidyquant)
# For the mapping example
library(ggmap)
library(gganimate)
# For rolling linear models
library(broom)
tq_get("AAPL") %>%
tq_mutate(select = adjusted, mutate_fun = dailyReturn) %>%
ggplot(aes(x = date, y = daily.returns)) +
geom_line() +
theme_tq()
airbnb <- read_csv("../data/tomslee_airbnb_san_diego_1436_2017-07-11.csv") %>%
as_tbl_time(last_modified) %>%
arrange(last_modified) %>%
select(last_modified, price, latitude, longitude)
## Parsed with column specification:
## cols(
## room_id = col_integer(),
## survey_id = col_integer(),
## host_id = col_integer(),
## room_type = col_character(),
## country = col_character(),
## city = col_character(),
## borough = col_character(),
## neighborhood = col_character(),
## reviews = col_integer(),
## overall_satisfaction = col_double(),
## accommodates = col_integer(),
## bedrooms = col_double(),
## bathrooms = col_character(),
## price = col_double(),
## minstay = col_character(),
## last_modified = col_datetime(format = ""),
## latitude = col_double(),
## longitude = col_double(),
## location = col_character()
## )
airbnb
## # A time tibble: 9,111 x 4
## # Index: last_modified
## last_modified price latitude longitude
## <dttm> <dbl> <dbl> <dbl>
## 1 2017-07-11 15:05:35 30.0 32.7 -117
## 2 2017-07-11 15:05:36 25.0 32.8 -117
## 3 2017-07-11 15:05:36 32.0 32.7 -117
## 4 2017-07-11 15:05:36 32.0 32.8 -117
## 5 2017-07-11 15:05:36 35.0 32.7 -117
## 6 2017-07-11 15:05:36 25.0 32.7 -117
## 7 2017-07-11 15:05:36 34.0 32.9 -117
## 8 2017-07-11 15:05:36 33.0 32.7 -117
## 9 2017-07-11 15:05:36 35.0 32.8 -117
## 10 2017-07-11 15:05:36 29.0 32.8 -117
## # ... with 9,101 more rows
The dplyr way:
airbnb %>%
filter(
last_modified >= as.POSIXct("2017-07-12 02:00:00", tz = "UTC"),
last_modified <= as.POSIXct("2017-07-12 02:59:59", tz = "UTC")
)
## # A time tibble: 67 x 4
## # Index: last_modified
## last_modified price latitude longitude
## <dttm> <dbl> <dbl> <dbl>
## 1 2017-07-12 02:06:01 500 32.7 -117
## 2 2017-07-12 02:13:36 667 32.8 -117
## 3 2017-07-12 02:14:37 575 33.0 -117
## 4 2017-07-12 02:15:02 678 33.0 -117
## 5 2017-07-12 02:16:05 575 32.8 -117
## 6 2017-07-12 02:18:44 800 32.8 -117
## 7 2017-07-12 02:18:47 724 32.8 -117
## 8 2017-07-12 02:18:47 825 32.7 -117
## 9 2017-07-12 02:18:47 900 32.8 -117
## 10 2017-07-12 02:18:47 989 32.8 -117
## # ... with 57 more rows
airbnb %>%
filter_time("2017-07-12 02:00:00" ~ "2017-07-12 02:59:59")
## # A time tibble: 67 x 4
## # Index: last_modified
## last_modified price latitude longitude
## <dttm> <dbl> <dbl> <dbl>
## 1 2017-07-12 02:06:01 500 32.7 -117
## 2 2017-07-12 02:13:36 667 32.8 -117
## 3 2017-07-12 02:14:37 575 33.0 -117
## 4 2017-07-12 02:15:02 678 33.0 -117
## 5 2017-07-12 02:16:05 575 32.8 -117
## 6 2017-07-12 02:18:44 800 32.8 -117
## 7 2017-07-12 02:18:47 724 32.8 -117
## 8 2017-07-12 02:18:47 825 32.7 -117
## 9 2017-07-12 02:18:47 900 32.8 -117
## 10 2017-07-12 02:18:47 989 32.8 -117
## # ... with 57 more rows
airbnb %>%
filter_time(~"2017-07-12 02")
## # A time tibble: 67 x 4
## # Index: last_modified
## last_modified price latitude longitude
## <dttm> <dbl> <dbl> <dbl>
## 1 2017-07-12 02:06:01 500 32.7 -117
## 2 2017-07-12 02:13:36 667 32.8 -117
## 3 2017-07-12 02:14:37 575 33.0 -117
## 4 2017-07-12 02:15:02 678 33.0 -117
## 5 2017-07-12 02:16:05 575 32.8 -117
## 6 2017-07-12 02:18:44 800 32.8 -117
## 7 2017-07-12 02:18:47 724 32.8 -117
## 8 2017-07-12 02:18:47 825 32.7 -117
## 9 2017-07-12 02:18:47 900 32.8 -117
## 10 2017-07-12 02:18:47 989 32.8 -117
## # ... with 57 more rows
More examples
# Filter for all days in 2017
airbnb %>%
filter_time(~"2017")
## # A time tibble: 9,111 x 4
## # Index: last_modified
## last_modified price latitude longitude
## <dttm> <dbl> <dbl> <dbl>
## 1 2017-07-11 15:05:35 30.0 32.7 -117
## 2 2017-07-11 15:05:36 25.0 32.8 -117
## 3 2017-07-11 15:05:36 32.0 32.7 -117
## 4 2017-07-11 15:05:36 32.0 32.8 -117
## 5 2017-07-11 15:05:36 35.0 32.7 -117
## 6 2017-07-11 15:05:36 25.0 32.7 -117
## 7 2017-07-11 15:05:36 34.0 32.9 -117
## 8 2017-07-11 15:05:36 33.0 32.7 -117
## 9 2017-07-11 15:05:36 35.0 32.8 -117
## 10 2017-07-11 15:05:36 29.0 32.8 -117
## # ... with 9,101 more rows
# All days in July to the end of August
airbnb %>%
filter_time("2017-07" ~ "2017-08")
## # A time tibble: 9,111 x 4
## # Index: last_modified
## last_modified price latitude longitude
## <dttm> <dbl> <dbl> <dbl>
## 1 2017-07-11 15:05:35 30.0 32.7 -117
## 2 2017-07-11 15:05:36 25.0 32.8 -117
## 3 2017-07-11 15:05:36 32.0 32.7 -117
## 4 2017-07-11 15:05:36 32.0 32.8 -117
## 5 2017-07-11 15:05:36 35.0 32.7 -117
## 6 2017-07-11 15:05:36 25.0 32.7 -117
## 7 2017-07-11 15:05:36 34.0 32.9 -117
## 8 2017-07-11 15:05:36 33.0 32.7 -117
## 9 2017-07-11 15:05:36 35.0 32.8 -117
## 10 2017-07-11 15:05:36 29.0 32.8 -117
## # ... with 9,101 more rows
# Start through the end of December
airbnb %>%
filter_time("start" ~ "2017-12")
## # A time tibble: 9,111 x 4
## # Index: last_modified
## last_modified price latitude longitude
## <dttm> <dbl> <dbl> <dbl>
## 1 2017-07-11 15:05:35 30.0 32.7 -117
## 2 2017-07-11 15:05:36 25.0 32.8 -117
## 3 2017-07-11 15:05:36 32.0 32.7 -117
## 4 2017-07-11 15:05:36 32.0 32.8 -117
## 5 2017-07-11 15:05:36 35.0 32.7 -117
## 6 2017-07-11 15:05:36 25.0 32.7 -117
## 7 2017-07-11 15:05:36 34.0 32.9 -117
## 8 2017-07-11 15:05:36 33.0 32.7 -117
## 9 2017-07-11 15:05:36 35.0 32.8 -117
## 10 2017-07-11 15:05:36 29.0 32.8 -117
## # ... with 9,101 more rows
collapse_by(airbnb, period = "1 day")
## # A time tibble: 9,111 x 4
## # Index: last_modified
## last_modified price latitude longitude
## <dttm> <dbl> <dbl> <dbl>
## 1 2017-07-11 22:58:12 30.0 32.7 -117
## 2 2017-07-11 22:58:12 25.0 32.8 -117
## 3 2017-07-11 22:58:12 32.0 32.7 -117
## 4 2017-07-11 22:58:12 32.0 32.8 -117
## 5 2017-07-11 22:58:12 35.0 32.7 -117
## 6 2017-07-11 22:58:12 25.0 32.7 -117
## 7 2017-07-11 22:58:12 34.0 32.9 -117
## 8 2017-07-11 22:58:12 33.0 32.7 -117
## 9 2017-07-11 22:58:12 35.0 32.8 -117
## 10 2017-07-11 22:58:12 29.0 32.8 -117
## # ... with 9,101 more rows
collapse_by(airbnb, period = "1 day") %>% tail
## # A time tibble: 6 x 4
## # Index: last_modified
## last_modified price latitude longitude
## <dttm> <dbl> <dbl> <dbl>
## 1 2017-07-12 05:20:42 73.0 32.8 -117
## 2 2017-07-12 05:20:42 68.0 32.7 -117
## 3 2017-07-12 05:20:42 90.0 32.8 -117
## 4 2017-07-12 05:20:42 90.0 32.8 -117
## 5 2017-07-12 05:20:42 350 32.7 -117
## 6 2017-07-12 05:20:42 100 32.8 -117
Collapse and summarise
# Collapse by 2 hour periods, summarise median price
airbnb %>%
collapse_by(period = "2 hour") %>%
group_by(last_modified) %>%
summarise(median_price = median(price))
## # A time tibble: 8 x 2
## # Index: last_modified
## last_modified median_price
## <dttm> <dbl>
## 1 2017-07-11 15:59:42 55.0
## 2 2017-07-11 17:59:54 100
## 3 2017-07-11 19:59:57 199
## 4 2017-07-11 21:48:16 450
## 5 2017-07-11 22:58:12 152
## 6 2017-07-12 00:59:43 285
## 7 2017-07-12 03:59:26 882
## 8 2017-07-12 05:20:42 40.0
# Clean and round up
airbnb %>%
collapse_by(period = "2 hour", clean = TRUE) %>%
group_by(last_modified) %>%
summarise(median_price = median(price))
## # A time tibble: 8 x 2
## # Index: last_modified
## last_modified median_price
## <dttm> <dbl>
## 1 2017-07-11 16:00:00 55.0
## 2 2017-07-11 18:00:00 100
## 3 2017-07-11 20:00:00 199
## 4 2017-07-11 22:00:00 450
## 5 2017-07-12 00:00:00 152
## 6 2017-07-12 02:00:00 285
## 7 2017-07-12 04:00:00 882
## 8 2017-07-12 06:00:00 40.0
# Clean and round down
airbnb %>%
collapse_by(period = "2 hour", clean = TRUE, side = "start") %>%
group_by(last_modified) %>%
summarise(median_price = median(price))
## # A time tibble: 8 x 2
## # Index: last_modified
## last_modified median_price
## <dttm> <dbl>
## 1 2017-07-11 14:00:00 55.0
## 2 2017-07-11 16:00:00 100
## 3 2017-07-11 18:00:00 199
## 4 2017-07-11 20:00:00 450
## 5 2017-07-11 22:00:00 152
## 6 2017-07-12 00:00:00 285
## 7 2017-07-12 02:00:00 882
## 8 2017-07-12 04:00:00 40.0
This works with ggmap v2.6.1 (CRAN) and ggplot2 v2.2.1 (CRAN). It is a bit finicky with earlier version / dev versions of either package.
airbnb_plot <- airbnb %>%
# Collapse and clean
collapse_by(period = "hour", clean = TRUE) %>%
# Throw out a few outliers
filter(
between(price, quantile(price, .05), quantile(price, .95))
) %>%
# Map and animate
qmplot(longitude, latitude, data = ., geom = "blank") +
geom_point(
aes(color = price, size = price, frame = last_modified),
alpha = .5) +
scale_color_continuous(low = "red", high = "blue")
## Using zoom = 11...
## Map from URL : http://tile.stamen.com/toner-lite/11/356/824.png
## Map from URL : http://tile.stamen.com/toner-lite/11/357/824.png
## Map from URL : http://tile.stamen.com/toner-lite/11/358/824.png
## Map from URL : http://tile.stamen.com/toner-lite/11/356/825.png
## Map from URL : http://tile.stamen.com/toner-lite/11/357/825.png
## Map from URL : http://tile.stamen.com/toner-lite/11/358/825.png
## Map from URL : http://tile.stamen.com/toner-lite/11/356/826.png
## Map from URL : http://tile.stamen.com/toner-lite/11/357/826.png
## Map from URL : http://tile.stamen.com/toner-lite/11/358/826.png
## Map from URL : http://tile.stamen.com/toner-lite/11/356/827.png
## Map from URL : http://tile.stamen.com/toner-lite/11/357/827.png
## Map from URL : http://tile.stamen.com/toner-lite/11/358/827.png
## Map from URL : http://tile.stamen.com/toner-lite/11/356/828.png
## Map from URL : http://tile.stamen.com/toner-lite/11/357/828.png
## Map from URL : http://tile.stamen.com/toner-lite/11/358/828.png
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: Ignoring unknown aesthetics: frame
gganimate(airbnb_plot)
data(FB, package = "tibbletime")
short_term_mean <- rollify(mean, window = 5)
long_term_mean <- rollify(mean, window = 50)
FB_roll <- FB %>%
mutate(short_mean = short_term_mean(adjusted),
long_mean = long_term_mean(adjusted))
FB_roll %>%
select(date, adjusted, short_mean, long_mean)
## # A tibble: 1,008 x 4
## date adjusted short_mean long_mean
## <date> <dbl> <dbl> <dbl>
## 1 2013-01-02 28.0 NA NA
## 2 2013-01-03 27.8 NA NA
## 3 2013-01-04 28.8 NA NA
## 4 2013-01-07 29.4 NA NA
## 5 2013-01-08 29.1 28.6 NA
## 6 2013-01-09 30.6 29.1 NA
## 7 2013-01-10 31.3 29.8 NA
## 8 2013-01-11 31.7 30.4 NA
## 9 2013-01-14 31.0 30.7 NA
## 10 2013-01-15 30.1 30.9 NA
## # ... with 998 more rows
Moving average plot
FB_roll %>%
gather(key = "Indicator", value = "value", short_mean, long_mean, adjusted) %>%
ggplot(aes(x = date, y = value, color = Indicator)) +
geom_line() +
labs(x = "Date", y = "Price", title = "FB Adjusted stock price with long/short term moving averages") +
theme_minimal()
## Warning: Removed 53 rows containing missing values (geom_path).
lm_roll <- rollify(
.f = ~ lm(.y ~ .x),
window = 5, unlist = FALSE)
FB_model <- FB %>%
mutate(
lag_volume = lag(volume),
model = lm_roll(lag_volume, adjusted)
)
FB_model
## # A tibble: 1,008 x 10
## symbol date open high low close volume adjusted lag_volume
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 FB 2013-01-02 27.4 28.2 27.4 28.0 69846400 28.0 NA
## 2 FB 2013-01-03 27.9 28.5 27.6 27.8 63140600 27.8 69846400
## 3 FB 2013-01-04 28.0 28.9 27.8 28.8 72715400 28.8 63140600
## 4 FB 2013-01-07 28.7 29.8 28.6 29.4 83781800 29.4 72715400
## 5 FB 2013-01-08 29.5 29.6 28.9 29.1 45871300 29.1 83781800
## 6 FB 2013-01-09 29.7 30.6 29.5 30.6 104787700 30.6 45871300
## 7 FB 2013-01-10 30.6 31.5 30.3 31.3 95316400 31.3 104787700
## 8 FB 2013-01-11 31.3 32.0 31.1 31.7 89598000 31.7 95316400
## 9 FB 2013-01-14 32.1 32.2 30.6 31.0 98892800 31.0 89598000
## 10 FB 2013-01-15 30.6 31.7 29.9 30.1 173242600 30.1 98892800
## # ... with 998 more rows, and 1 more variable: model <list>
Looking at your model results with broom.
FB_model %>%
filter(!is.na(model)) %>%
mutate(glanced = map(model, glance)) %>%
select(date, glanced) %>%
unnest()
## # A tibble: 1,004 x 12
## date r.squared adj.r.squared sigma statistic p.value df logLik
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 2013-01-08 0.113 -0.330 0.817 0.255 0.664 2 -3.48
## 2 2013-01-09 0.326 0.102 0.972 1.45 0.314 2 -5.67
## 3 2013-01-10 0.0895 -0.214 1.19 0.295 0.625 2 -6.68
## 4 2013-01-11 0.130 -0.159 1.24 0.450 0.550 2 -6.91
## 5 2013-01-14 0.106 -0.193 1.11 0.354 0.594 2 -6.36
## 6 2013-01-15 0.0861 -0.219 0.691 0.282 0.632 2 -3.97
## 7 2013-01-16 0.426 0.235 0.693 2.23 0.233 2 -3.98
## 8 2013-01-17 0.180 -0.0932 0.808 0.659 0.476 2 -4.75
## 9 2013-01-18 0.0000962 -0.333 0.569 0.000289 0.988 2 -3.00
## 10 2013-01-22 0.0845 -0.221 0.447 0.277 0.635 2 -1.79
## # ... with 994 more rows, and 4 more variables: AIC <dbl>, BIC <dbl>,
## # deviance <dbl>, df.residual <int>
data(FANG, package = "tibbletime")
FANG_time <- FANG %>%
group_by(symbol) %>%
as_tbl_time(date)
slice(FANG_time, 1:2)
## # A time tibble: 8 x 8
## # Index: date
## # Groups: symbol [4]
## symbol date open high low close volume adjusted
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AMZN 2013-01-02 256 258 253 257 3271000 257
## 2 AMZN 2013-01-03 257 261 256 258 2750900 258
## 3 FB 2013-01-02 27.4 28.2 27.4 28.0 69846400 28.0
## 4 FB 2013-01-03 27.9 28.5 27.6 27.8 63140600 27.8
## 5 GOOG 2013-01-02 719 727 717 723 5101500 361
## 6 GOOG 2013-01-03 725 732 721 724 4653700 361
## 7 NFLX 2013-01-02 95.2 95.8 90.7 92.0 19431300 13.1
## 8 NFLX 2013-01-03 92.0 97.9 91.5 96.6 27912500 13.8
FANG_return <- FANG_time %>%
select(symbol, date, adjusted) %>%
calculate_return(adjusted, period = "daily") %>%
mutate(drawdown = drawdown(adjusted_return),
cum_ret = cumulative_return(adjusted_return))
FANG_return
## # A time tibble: 4,032 x 6
## # Index: date
## # Groups: symbol [4]
## symbol date adjusted adjusted_return drawdown cum_ret
## <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 FB 2013-01-02 28.0 0 0 0
## 2 FB 2013-01-03 27.8 -0.00821 -0.00821 -0.00821
## 3 FB 2013-01-04 28.8 0.0356 0 0.0271
## 4 FB 2013-01-07 29.4 0.0229 0 0.0507
## 5 FB 2013-01-08 29.1 -0.0122 -0.0122 0.0379
## 6 FB 2013-01-09 30.6 0.0526 0 0.0925
## 7 FB 2013-01-10 31.3 0.0232 0 0.118
## 8 FB 2013-01-11 31.7 0.0134 0 0.133
## 9 FB 2013-01-14 31.0 -0.0243 -0.0243 0.105
## 10 FB 2013-01-15 30.1 -0.0275 -0.0511 0.0750
## # ... with 4,022 more rows
FANG_return_monthly <- FANG_return %>%
collapse_by("month") %>%
group_by(symbol, date) %>%
summarise(monthly_return = total_return(adjusted_return))
FANG_return_monthly
## # A time tibble: 192 x 3
## # Index: date
## # Groups: symbol [?]
## symbol date monthly_return
## <chr> <date> <dbl>
## 1 AMZN 2013-01-31 0.0318
## 2 AMZN 2013-02-28 -0.00463
## 3 AMZN 2013-03-28 0.00840
## 4 AMZN 2013-04-30 -0.0476
## 5 AMZN 2013-05-31 0.0606
## 6 AMZN 2013-06-28 0.0315
## 7 AMZN 2013-07-31 0.0847
## 8 AMZN 2013-08-30 -0.0672
## 9 AMZN 2013-09-30 0.113
## 10 AMZN 2013-10-31 0.164
## # ... with 182 more rows
Cumulative returns
plot_cum_ret <- FANG_return %>%
ggplot(aes(x = date, y = cum_ret, color = symbol)) +
geom_line() +
theme_tq() +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
labs(
y = "Cumulative Return",
title = "Performance summary: Facebook, Amazon, Netflix, Google") +
theme(legend.position="none") +
scale_color_tq()
Monthly returns
plot_month_ret <- FANG_return_monthly %>%
ggplot(aes(x = date, y = monthly_return, fill = symbol)) +
geom_col(width = 15, position = position_dodge()) +
theme_tq() +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
labs(y = "Monthly Return") +
theme(legend.position="none") +
scale_fill_tq()
Drawdown
plot_drawdown <- FANG_return %>%
ggplot(aes(x = date, y = drawdown, fill = symbol)) +
geom_area(position = position_identity(), alpha = .7) +
theme_tq() +
scale_x_date(
date_breaks = "3 months",
date_labels = "%b %Y") +
labs(x = "", y = "Drawdown") +
scale_fill_tq()
At this point you will need patchwork to run the following code, uncomment the lines below to first install patchwork from github. It should also install the dev version of ggplot2. Then you will likely need to restart R and rerun the code that involves creating the 3 FANG charts that will be added together (everything after the tidyfinance - FANG chunk). Do not try and run the map code again, as it will not work now that you have the dev version of ggplot2.
# # For performance summary plots
# devtools::install_github("thomasp85/patchwork", force = TRUE)
# library(patchwork)
#
# plot_cum_ret +
# plot_month_ret +
# plot_drawdown +
# plot_layout(ncol = 1, heights = c(2, 1, 1))